library(here); library(plyr); library(tidyverse); library(ggpubr); library(stargazer); library(lubridate)

# Get results files ----
load(here::here("risp-rr", "results_wordfish.Rdata"))
load(here::here("risp-rr", "results_stm.Rdata"))
load(here::here("risp-rr", "dat_transcripts_alltopics.Rdata"))

# Creating the results dataframe------------------------------------------------------------------------
doc_scores <- tibble(
  omega = results_wf$theta, 
  omega_se = results_wf$se.theta,
  alpha = results_wf$alpha,
  id = results_wf$docs
) %>% 
  tidyr::separate(col = id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy)

word_scores <- tibble(
  psi = results_wf$psi, 
  beta = results_wf$beta, 
  words = results_wf$features
)


# Table 2 tops words ------------------
# Top "soft" news words
word_scores %>% arrange(desc(beta))

# Top "hard" news words
word_scores %>% arrange(beta)


# Average omega by channel -------
doc_scores %>% 
  group_by(TVchannel) %>% 
  summarise(
    TVscores = mean(omega, na.rm = TRUE)
  )


# Figure discontinuity ------------------------------------------------
load(file=here::here("data","results_ale.RData"))

result.omega <- as.data.frame(results@theta)
result.omega$id <- results@docs
names(result.omega) <- c("omega","id")
TVchannels<-strsplit(result.omega$id, "[.]")
TVchannels<-as.character(lapply(TVchannels,function(l) l[[1]]))
tmp <- strsplit(result.omega$id, "[.]")
TVdate<-do.call(rbind, tmp)[,2]
result.omega$date<-as.Date(TVdate)
result.omega$TVchannel<-TVchannels
load(here::here("data","directors.Rdata"))
directors$Titolo<-gsub("rai1","TG1", directors$Titolo)
directors$Titolo<-gsub("rai2","TG2", directors$Titolo)
directors$Titolo<-gsub("rai3","TG3", directors$Titolo)
directors$Titolo<-gsub("rete4","TG4", directors$Titolo)
directors$Titolo<-gsub("canale5","TG5", directors$Titolo)
directors$Titolo<-gsub("italia1","StudioAperto", directors$Titolo)
directors$Titolo<-gsub("la7","TG7", directors$Titolo)
names(directors)<-c("TVchannel", "pol.affiliation","Director","date")
library(plyr)
result.omega.ordered<-result.omega[order(result.omega$date),]
result.omega<-join(result.omega.ordered, directors, by=c("date","TVchannel"), type='left', match='first' )

result.omega.ordered=result.omega.ordered %>% mutate(mycolor.omega = ifelse(omega>0, "type1", "type2"))

result.omega.ordered.TG1=filter(result.omega.ordered, TVchannel=="TG1")
result.omega.ordered.TG2=filter(result.omega.ordered, TVchannel=="TG2")
result.omega.ordered.TG3=filter(result.omega.ordered, TVchannel=="TG3")

result.omega.ordered$Rai<-as.numeric(result.omega.ordered$TVchannel=="TG1" | result.omega.ordered$TVchannel=="TG2" | result.omega.ordered$TVchannel=="TG3")
result.omega.ordered.Rai=filter(result.omega.ordered, Rai==1)
result.omega.ordered.Rai.mean<-aggregate(result.omega.ordered.Rai$omega, list(data=result.omega.ordered.Rai$date), mean)
result.omega.ordered.Rai.mean=result.omega.ordered.Rai.mean %>% mutate(mycolor.omega = ifelse(x>0, "type1", "type2"))
freq_plot_rai<-ggplot(result.omega.ordered.Rai.mean, aes(x=data, y=x)) +
  geom_segment( aes(x=data, xend=data, y=0, yend=x, color=mycolor.omega), size=0.3, alpha=0.9) +
  theme_light() +
  geom_vline(xintercept=as.numeric(result.omega.ordered$date[4700]), linetype="dashed", size=0.7 )+
  theme(
    legend.position = "none",
    panel.border = element_blank(),
  ) +
  annotate("text", x=result.omega.ordered$date[7000],  y=1.8, label="Berlusconi Resigns", colour="Black", vjust = 1.2, angle=0, size=4)+
  xlab("") +
  ylab("Average Omega Score")+
  ggtitle("Rai, Public")

result.omega.ordered.Mediaset=filter(result.omega.ordered, Rai==0)
result.omega.ordered.Mediaset.mean<-aggregate(result.omega.ordered.Mediaset$omega, list(data=result.omega.ordered.Mediaset$date), mean)
result.omega.ordered.Mediaset.mean=result.omega.ordered.Mediaset.mean %>% mutate(mycolor.omega = ifelse(x>0, "type1", "type2"))
freq_plot_mediaset<-ggplot(result.omega.ordered.Mediaset.mean, aes(x=data, y=x)) +
  geom_segment( aes(x=data, xend=data, y=0, yend=x, color=mycolor.omega), size=0.3, alpha=0.9) +
  theme_light() +
  geom_vline(xintercept=as.numeric(result.omega.ordered$date[4700]), linetype="dashed", size=0.7 )+
  theme(
    legend.position = "none",
    panel.border = element_blank(),
  ) +
  annotate("text", x=result.omega.ordered$date[7000],  y=1.8, label="Berlusconi Resigns", colour="Black", vjust = 1.2, angle=0, size=4)+
  xlab("") +
  ylab("Average Omega Score")+
  ggtitle("Mediaset, Berlusconi owned")

result.omega.ordered.TG7=filter(result.omega.ordered, TVchannel=="TG7")
result.omega.ordered.TG7.mean<-aggregate(result.omega.ordered.TG7$omega, list(data=result.omega.ordered.TG7$date), mean)
result.omega.ordered.TG7.mean=result.omega.ordered.TG7.mean %>% mutate(mycolor.omega = ifelse(x>0, "type1", "type2"))
freq_plot_TG7<-ggplot(result.omega.ordered.TG7.mean, aes(x=data, y=x)) +
  geom_segment( aes(x=data, xend=data, y=0, yend=x, color=mycolor.omega), size=0.3, alpha=0.9) +
  theme_light() +
  geom_vline(xintercept=as.numeric(result.omega.ordered$date[4700]), linetype="dashed", size=0.7 )+
  theme(
    legend.position = "none",
    panel.border = element_blank(),
  ) +
  annotate("text", x=result.omega.ordered$date[7000],  y=1.8, label="Berlusconi Resigns", colour="Black", vjust = 1.2, angle=0, size=4)+
  xlab("") +
  ylab("Average Omega Score")+
  ggtitle("TG7")

require(gridExtra)
pdf("Omega_TimeSeries.pdf")
grid.arrange(freq_plot_rai, freq_plot_mediaset, freq_plot_TG7, nrow=3)  
dev.off()

# Validation: RAI labels ----------------------------------------------
issue_cov <- dat %>%
  # Calculate the total duration for each doc_name_long and MacroArgomento
  group_by(doc_name_long, MacroArgomento) %>%
  summarize(total_duration_per_group = sum(duration)) %>%
  ungroup() %>%
  # Calculate the total duration across all MacroArgomento for each doc_name_long
  group_by(doc_name_long) %>%
  mutate(total_duration_per_doc = sum(total_duration_per_group)) %>%
  ungroup() %>%
  # Calculate the share of each MacroArgomento
  mutate(share = total_duration_per_group / total_duration_per_doc) %>% 
  select(doc_name_long, MacroArgomento, share) %>% 
  tidyr::pivot_wider(
    names_from = MacroArgomento, 
    values_from = share,
    names_prefix = "issue_"
  ) %>% replace(is.na(.), 0) %>%  # Replace NA with 0 if any MacroArgomento is not present for a doc_name_long
  rename(issue_TempoLibero = `issue_Tempo Libero`)

# Step 1: Join the data frames
combined_data <- left_join(issue_cov, doc_scores, by = c("doc_name_long" = "id"))

# Step 2: Gather the issue columns into long format and filter out zeros
long_data <- combined_data %>%
  select(doc_name_long, starts_with("issue_"), omega) %>%
  gather(key = "MacroArgomento", value = "share", -doc_name_long, -omega) %>%
  filter(share > 0) %>% 
  filter(!MacroArgomento=="issue_NA")    # n. 5 NAs 

# Cut the share into intervals
long_data$share_category <- cut(long_data$share,
                                breaks = c(-Inf, 0.25, 0.5, 0.75, Inf),
                                labels = c("0 - 0.25", "0.25 - 0.5", "0.5 - 0.75", "> 0.75"),
                                include.lowest = TRUE)

# Rename the MacroArgomento for nicer plot labels if needed
nice_names <- c("issue_Cronaca" = "Crime news and events",
                "issue_Esteri" = "Foreign Affairs",
                "issue_Giustizia" = "Legal Affairs",
                "issue_Politica" = "Politics",
                "issue_TempoLibero" = "Lifestyle",
                "issue_Economia" = "Economy")
long_data$MacroArgomento <- nice_names[long_data$MacroArgomento]
long_data$MacroArgomento <- fct_relevel(long_data$MacroArgomento, "Crime news and events", "Lifestyle", "Foreign Affairs", "Legal Affairs", "Economy", "Politics")

# Step 3: Create a composite plot with boxplots
p <- ggplot(long_data, aes(x = share, y = omega)) +
  geom_jitter(width = 0.1, alpha = 0.2, size = .5) +  # Smaller and more transparent dots
  geom_boxplot(aes(group = share_category), alpha = 0.85, color = "black", fill = "white") +  # Boxplot with transparency
  facet_wrap(~ MacroArgomento, scales = "free_x") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.25), labels = scales::percent_format(scale = 1)) +
  scale_y_continuous(limits = c(-3, 3)) +
  theme_minimal() +
  labs(x = "Proportion of issue coverage by issue for each news edition", y = "Omega score") +
  theme(strip.text.x = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major.x = element_blank(), # Remove the vertical grid lines
        panel.grid.minor.x = element_blank())
ggsave(plot = p, filename=here::here("figs", "figure1-omega-topics.png"), device = 'png', width = 12, height = 8, dpi = 300)

# Validation: stm topics  ----------------------------------------------
library(stm)
load(here::here("risp-rr", "dfm_clean_stem_alltopics.Rdata"))

# Step 1: Determine the dominant topic for each document
# We'll use which.max to find the index of the maximum value in each row (document)
dominant_topic <- apply(results_stm$theta, 1, which.max)
document_ids <- dfm_clean_stem@docvars$docname_

# Step 2: Summarize the most representative words for each topic
# Assuming the function labelTopics can give us the most probable words for each topic
topics_labels <- tibble(dominant_topic = as.factor(1:20), 
                        labels = apply(labelTopics(results_stm, n = 5)$frex, 1, function(x) paste(x, collapse = ", ")))
dominant_topic <- tibble(doc_id = dfm_clean_stem@docvars$docname_, 
                             dominant_topic = as.factor(apply(results_stm$theta, 1, which.max))) %>% 
  dplyr::left_join(y = topics_labels, by = "dominant_topic")


# Step 3: Join the dominant topics with their respective omega scores
# Note: Ensure that 'results_wf$docs' and row names of 'results_stm$theta' are in the same order
omega_scores <- tibble(doc_id = results_wf$docs, omega = results_wf$theta)
combined_data <- dplyr::left_join(x = dominant_topic, y = omega_scores, by = "doc_id") %>% 
  tidyr::separate(col = doc_id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy)

# Step 4: Create a dot plot of omega scores by topic
avg_omega_per_topic <- combined_data %>%
  group_by(dominant_topic) %>%
  summarize(
    avg_omega = mean(omega, na.rm = TRUE), 
    lower_ci = mean(omega, na.rm = TRUE) - qt(0.975, df=n()-1) * sd(omega, na.rm = TRUE)/sqrt(n()),
    upper_ci = mean(omega, na.rm = TRUE) + qt(0.975, df=n()-1) * sd(omega, na.rm = TRUE)/sqrt(n()),
    .groups = 'drop'
    ) %>% 
  dplyr::left_join(y = topics_labels, by = "dominant_topic") %>% 
  mutate(labels = fct_reorder(labels, avg_omega))


# Now plotting
dotplot <- ggplot(avg_omega_per_topic, aes(x = avg_omega, y = labels)) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.25) +  # Horizontal error bars for the confidence intervals
  geom_point(size = 2, shape = 16) +  # Black dots for the average scores
  labs(x = "Omega Score", y = "Topic (Most Representative Words)") +
  theme_minimal() +
  theme(legend.position = "none")  # Omit the legend
ggsave(plot = dotplot, filename = here::here("figs", "figure2.png"), width = 8, height = 8, dpi = 300, bg = "white")



# Validation zero-shot ------------------------
doc_scores
zs <- readr::read_csv(here::here("risp-rr", "dat_subset_zs_pred.csv"))
zs <- as_tibble(zs) %>% select(-c(HardSoftCategory, HardSoftCategory_2, ada_embedding)) %>% rename(id=doc_name_long)

combined_data <- zs %>% 
  left_join(doc_scores, by = "id") %>% 
  filter(zs_hard_similarity > 0.2 & zs_soft_similarity > 0.2) %>% 
  mutate(zs_pred01 = ifelse(zs_predicted_class == "Soft", 1, 0))
  
ggplot(combined_data, aes(x = zs_predicted_class, y = omega)) +
  geom_boxplot() +
  labs(x = "Predicted Class: zero-shot classification from Open AI word embeddings", y = "Wordfish Omega Score") +
  theme_minimal()

fit <- glm(zs_pred01 ~ omega, data = combined_data, family = "binomial")
combined_data %>% mutate(
  pred = predict(fit, type = "response")
)
predictions <- predict(fit, type = "response")

# Model Evaluation
caret::confusionMatrix(table(round(predictions), combined_data$zs_pred01))

library(pROC)
roc_curve <- roc(combined_data$zs_pred01, predictions)
plot(roc_curve)
auc(roc_curve)

# Summary of the model
summary(fit)

# Predictions
predictions <- predict(fit, type = "response")

# Model Evaluation
confusionMatrix(table(round(predictions), combined_data$zs_class_binary))



# Diff-in-diff --------------------------------
library(dplyr); library(plm); library(splines)
load(here::here("risp-rr", "results_wordfish.Rdata"))

doc_scores <- tibble(
  omega = results_wf$theta, 
  omega_se = results_wf$se.theta,
  alpha = results_wf$alpha,
  id = results_wf$docs
) %>% 
  tidyr::separate(col = id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy) %>% 
  filter(!TVchannel == "TG7") %>% 
  mutate(
    date = as.Date(date),  # Ensure the date column is in Date format
    lambda_RAI = as.numeric(ifelse(TVchannel %in% c("TG1", "TG2", "TG3"), 1, 0)),  # Create RAI dummy
    Post_t = as.numeric(date > as.Date("2011-11-12")),  # Create Post dummy
    Rai_Post = lambda_RAI * Post_t  # Interaction term
  )

# Step 3: Calculate bi-weeks since the start for the splines
# Assuming the earliest date in your dataset is the start
start_date <- min(doc_scores$date)
# Calculate the number of weeks since the start date
doc_scores$biweeks_since_start <- floor(as.numeric(difftime(doc_scores$date, start_date, units = "weeks")) / 2)

# Define the knots for the splines - here we define them bi-weekly
max_biweeks <- max(doc_scores$biweeks_since_start)
knots <- seq(0, max_biweeks, by = 2)

did_model_1 <- felm(omega ~ Post_t + Rai_Post |    # lambda_RAI perefctly multicollinear due to TVchannel FE.  
                    TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
  data = doc_scores
)
summary(did_model_1)
did_model_2 <- felm(omega ~ Post_t + Rai_Post +    # lambda_RAI perefctly multicollinear due to TVchannel FE.  
                      bs(biweeks_since_start, knots = knots, degree = 3) | 
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(did_model_2)
did_model_3 <- felm(omega ~ Post_t * TVchannel +
                      bs(biweeks_since_start, knots = knots, degree = 3) | 
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(did_model_3)
table4 <- stargazer::stargazer(did_model_1, did_model_2, did_model_3, type = "latex", style = "apsr",
          star.cutoffs = c(.05, .01, .001), 
          keep = c("^Rai_Post$", "^Post_t:TVchannelTG1$", "^Post_t:TVchannelTG2$", "^Post_t:TVchannelTG3$"))
save(table4, file = "tables/table4.tex")



# Robustness: omitting four days of instability? -----------------
rob_1 <- felm(omega ~ Post_t + Rai_Post |   
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(rob_1)
rob_2 <- felm(omega ~ Post_t + Rai_Post |   
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = filter(doc_scores, !(date >= as.Date("2011-11-12") & date <= as.Date("2011-11-16")))
)
summary(did_model_1)
stargazer(rob_1, rob_2, type = "text", style = 'apsr', star.cutoffs = c(.05, .01, .001))


# Robustness: la7 instead of Mediaset -------

# Diff-in-diff --------------------------------
library(dplyr); library(plm); library(splines)
load(here::here("risp-rr", "results_wordfish.Rdata"))

tg7 <- tibble(
  omega = results_wf$theta, 
  omega_se = results_wf$se.theta,
  alpha = results_wf$alpha,
  id = results_wf$docs
) %>% 
  tidyr::separate(col = id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy) %>% 
  filter(TVchannel %in% "TG7") %>% 
  mutate(
    date = as.Date(date),  # Ensure the date column is in Date format
    Post_t = as.numeric(date > as.Date("2011-11-12")),  # Create Post dummy
  )


# Step 3: Calculate bi-weeks since the start for the splines
# Assuming the earliest date in your dataset is the start
start_date <- min(doc_scores$date)
# Calculate the number of weeks since the start date
doc_scores$biweeks_since_start <- floor(as.numeric(difftime(doc_scores$date, start_date, units = "weeks")) / 2)

# Define the knots for the splines - here we define them bi-weekly
max_biweeks <- max(doc_scores$biweeks_since_start)
knots <- seq(0, max_biweeks, by = 2)

did_model_1 <- felm(omega ~ Post_t + Rai_Post |    # lambda_RAI perefctly multicollinear due to TVchannel FE.  
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(did_model_1)
did_model_2 <- felm(omega ~ Post_t + Rai_Post +    # lambda_RAI perefctly multicollinear due to TVchannel FE.  
                      bs(biweeks_since_start, knots = knots, degree = 3) | 
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(did_model_2)
did_model_3 <- felm(omega ~ Post_t * TVchannel +
                      bs(biweeks_since_start, knots = knots, degree = 3) | 
                      TVchannel | 0 | id,  # Fixed effects for TV channels and clustering by id
                    data = doc_scores
)
summary(did_model_3)
stargazer::stargazer(did_model_1, did_model_2, did_model_3, type = "text", style = "apsr",
                               star.cutoffs = c(.05, .01, .001), 
                               keep = c("^Rai_Post$", "^Post_t:TVchannelTG1$", "^Post_t:TVchannelTG2$", "^Post_t:TVchannelTG3$"))

# Figure 2 -----------
library(lfe); library(dplyr); library(splines); library(ggplot2); library(broom)

# doc_scores <- doc_scores %>% 
#   filter(date > "2011-05-31" & date < "2012-05-01")
# Assuming 'date' is already of class Date
doc_scores$year_month <- format(doc_scores$date, "%Y-%m")
doc_scores$months_since_start <- as.numeric(format(doc_scores$date, "%Y%m")) - min(as.numeric(format(doc_scores$date, "%Y%m")))

# Create monthly dummy variables
doc_scores$month_year_factor <- factor(doc_scores$year_month)

# Create the RAI and Post interaction term for each month
doc_scores <- doc_scores %>%
  group_by(month_year_factor) %>%
  mutate(interaction = lambda_RAI * Post_t) %>%
  ungroup()

# Create the interaction terms for the event-study
interactions <- model.matrix(~ month_year_factor * lambda_RAI - 1, data = doc_scores)

# Estimate the event-study model
event_study_model <- lm(omega ~ lambda_RAI + Post_t + interactions, data = doc_scores)

# Summarize the results
summary(event_study_model)

# Assuming 'event_study_model' contains the results of the model estimation
coef_summary <- summary(event_study_model)

# Extract coefficients and confidence intervals for the interaction terms
# Assuming 'interactions' is the matrix of interaction terms used in the model
coef_df <- tidy(coef_summary)$term[grepl("month_year_factor", tidy(coef_summary)$term)] %>%
  data.frame(term = .) %>%
  left_join(tidy(coef_summary), by = "term") %>%
  mutate(
    conf.low = estimate - qnorm(p = 0.65) * std.error,
    conf.high = estimate + qnorm(p = 0.65) * std.error,
    date = str_extract_all(pattern = "\\d{4}-\\d{2}", term),
    date = as.Date(paste0(date, "-01")),  # Assuming the format is "YYYY-MM"
    is_reference = date == as.Date("2011-11-01")  # Identify the reference category
  ) %>% 
  filter(str_detect(pattern = "\\:lambda", term)) %>% 
  arrange(date) %>% 
  filter(date >= as.Date("2011-06-01") & date <= as.Date("2012-04-30"))

# Reorder the data by date for plotting
coef_df <- coef_df[order(coef_df$date), ]
coeffrescale <- coef_df[coef_df$is_reference==TRUE, "estimate"]
coef_df <- coef_df %>% 
  mutate(
    estimate = estimate - coeffrescale, 
    conf.low = conf.low - coeffrescale, 
    conf.high = conf.high - coeffrescale
  )

# Create the coefficient plot
ggplot(coef_df, aes(x = date, y = estimate)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_point(aes(color = is_reference), size = 3) +  # Black dots for the estimates
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0, color = "black") +
  geom_vline(xintercept = as.numeric(as.Date("2011-11-01")), color = "red", linetype = "solid") +
  scale_color_manual(values = c("black", "red")) +
  theme_minimal() +
  labs(x = "Date", y = "Coefficients") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b", limits = c(as.Date("2011-06-01"), as.Date("2012-04-01"))) 
  annotate("text", x = as.Date("2011-11-01"), y = 0, label = "Berlusconi resigns", angle = 90, vjust = 1)




# Mechanisms ---------
load(here::here("risp-rr", "dat_transcripts_alltopics.Rdata"))
dat <- dat %>% 
  filter(MacroArgomento %in% c("Politica", "Economia")) %>% 
  select(doc_name_long, Time.Start, Time.Stop, duration, MacroArgomento) %>% 
  rename(id = doc_name_long) %>% 
  mutate(duration_min = duration*60)

load(here::here("risp-rr", "results_wordfish.Rdata"))
doc_scores <- tibble(
  omega = results_wf$theta, 
  omega_se = results_wf$se.theta,
  alpha = results_wf$alpha,
  id = results_wf$docs
) %>% 
  tidyr::separate(col = id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy) %>% 
  filter(!TVchannel == "TG7") %>% 
  mutate(
    date = as.Date(date),  # Ensure the date column is in Date format
    lambda_RAI = as.numeric(ifelse(TVchannel %in% c("TG1", "TG2", "TG3"), 1, 0)),  # Create RAI dummy
    Post_t = as.numeric(date > as.Date("2011-11-12")),  # Create Post dummy
    Rai_Post = lambda_RAI * Post_t  # Interaction term
  )

combined_data <- doc_scores %>% 
  left_join(dat, by = "id")

combined_data <- combined_data %>% 
  group_by(date, Post_t, lambda_RAI, MacroArgomento) %>% 
  summarize(
    omega = mean(omega, na.rm=TRUE), 
    cum_air = sum(duration_min, na.rm=TRUE)
  )

m1 <- lm(cum_air ~ lambda_RAI*Post_t, data = filter(combined_data, MacroArgomento=="Economia"))
m2 <- lm(cum_air ~ lambda_RAI*Post_t, data = filter(combined_data, MacroArgomento=="Politica"))
m3 <- lm(omega ~ lambda_RAI*Post_t, data = filter(combined_data, MacroArgomento=="Economia"))
m4 <- lm(omega ~ lambda_RAI*Post_t, data = filter(combined_data, MacroArgomento=="Politica"))

stargazer(m1, m2, m3, m4, type = "text")

# Table 6: ITANES estimats --------------------------------------------------------------------------------
library("rio"); library("sjmisc"); library("stringr"); library("dplyr")

itanes <- rio::import(here::here("data", "processed","panel.dta"))
load(here::here("risp-rr", "results_wordfish.Rdata"))

# Creating the results dataframe------------------------------------------------------------------------
doc_scores <- tibble(
  omega = results_wf$theta, 
  omega_se = results_wf$se.theta,
  alpha = results_wf$alpha,
  id = results_wf$docs
) %>% 
  tidyr::separate(col = id, into = c("TVchannel", "date", "edition", "dummy"), sep = "_", remove = FALSE) %>% 
  select(-dummy)

# Omega scores
load(file.path(here::here(), "data", "processed", "results_completedfm.RData"))
omega <- data.frame(document=results@docs,
                    theta=results@theta,
                    se=results@se.theta, stringsAsFactors = F)
omega$tg <- sapply(strsplit(omega$document, "\\."), "[[", 1)
omega$date <- sapply(strsplit(omega$document, "\\."), "[[", 2)
omega$hour <- sapply(strsplit(omega$document, "\\."), "[[", 3)
omega$date <- as.Date(omega$date)

# Figure 1 - Content scores by network --------------------------------------------------------------------------------------------
result.omega.ordered <- result.omega[order(result.omega$date),]
result.omega.ordered$Rai<-as.numeric(result.omega.ordered$TVchannel=="TG1" | result.omega.ordered$TVchannel=="TG2" | result.omega.ordered$TVchannel=="TG3")
result.omega.ordered.Rai=filter(result.omega.ordered, Rai==1)
result.omega.ordered.Rai.mean<-aggregate(result.omega.ordered.Rai$omega, list(data=result.omega.ordered.Rai$date), mean)
result.omega.ordered.Rai.mean=result.omega.ordered.Rai.mean %>% mutate(mycolor.omega = ifelse(x>0, "type1", "type2"))
freq_plot_rai <- ggplot(result.omega.ordered.Rai.mean, aes(x=data, y=x)) +
        geom_segment( aes(x=data, xend=data, y=0, yend=x, color=mycolor.omega), size=0.3, alpha=0.9) +
        theme_bw() +
        scale_color_grey() + 
        geom_vline(xintercept=as.numeric(result.omega.ordered$date[4700]), linetype="dashed", size=0.7 )+
        theme(legend.position = "none", panel.border = element_blank()) +
        ggplot2::annotate("text", x=as.Date("2012-04-01"),  y=1.8, label="Berlusconi resigns", colour="grey10", vjust = 1.2, angle=0, size=3)+
        xlab("") +
        ylab("Average Omega Score")+
        ggtitle("Rai")

result.omega.ordered.Mediaset=filter(result.omega.ordered, Rai==0)
result.omega.ordered.Mediaset.mean<-aggregate(result.omega.ordered.Mediaset$omega, list(data=result.omega.ordered.Mediaset$date), mean)
result.omega.ordered.Mediaset.mean=result.omega.ordered.Mediaset.mean %>% mutate(mycolor.omega = ifelse(x>0, "type1", "type2"))
freq_plot_mediaset <- ggplot(result.omega.ordered.Mediaset.mean, aes(x=data, y=x)) +
        geom_segment( aes(x=data, xend=data, y=0, yend=x, color=mycolor.omega), size=0.3, alpha=0.9) +
        theme_bw() +
        scale_color_grey() + 
        geom_vline(xintercept=as.numeric(result.omega.ordered$date[4700]), linetype="dashed", size=0.7 )+
        theme(legend.position = "none", panel.border = element_blank()) +
        ggplot2::annotate("text", x=as.Date("2012-04-01"),  y=1.8, label="Berlusconi resigns", colour="grey10", vjust = 1.2, angle=0, size=3)+
        xlab("") +
        ylab("Average Omega Score")+
        ggtitle("Mediaset")
require(ggpubr)
Fig_NetworkPlot <- ggpubr::ggarrange(freq_plot_rai, freq_plot_mediaset, nrow=2)
ggsave(Fig_NetworkPlot, filename = file.path(here::here(), "figures","Fig_NetworkPlot.png"),
       device = "png", width = 7, height = 5, units = "in", dpi=600)




# Mechanisms of media bias ------------------------------------------------------------------------
legend <- c("TG7"="grey70", "TG3"="grey60", "TG2"="grey50", "TG1"="grey40", "TG5"="grey30", "TG4"="grey20", "StudioAperto"="grey10")

####
# Current affairs News: --------
####
load(file=file.path(here::here(),"data", "processed","dat.cron.wordfish.RData"))
dat$Titolo <- factor(dat$Titolo)
dat$Titolo <- factor(dat$Titolo, levels(dat$Titolo)[c(2,3,4,5,6,1,7)])
dat$month <- month(dat$date)
dat$year <- year(dat$date)
dat$date_num <- as.factor(paste(dat$month, dat$year, sep="-"))
dat$date_num <- relevel(dat$date_num, ref="7-2010")

# Regression baseline
omega.fit.cron <- lm(theta ~ Titolo,
                data=dat)
# + Month FE
omega.fit2.cron <- lm(theta ~ Titolo + date_num ,
                 data=dat)
stargazer(omega.fit.cron, omega.fit2.cron, 
          style = "apsr", type = "latex", 
          star.cutoffs = c(0.05, 0.01, 0.001),
          digits = 3,
          omit.stat = c("adj.rsq"),
          omit = c("date_num"),
          out = file.path(here::here(),"output", "table_omega_reg_cronaca.tex"),
          add.lines=list(c("Month FE", "", "X")))


# plot
load(file=file.path(here::here(),"data", "processed","dat.cron.wordfish.RData"))
dat.plot <- dat %>% 
        select(theta, date, Titolo) %>% 
        filter(!is.na(theta) & !is.na(Titolo))

dat.plot$Titolo <- fct_reorder(.f = dat.plot$Titolo, .x = dat.plot$theta, .fun = mean, na.rm=TRUE, .desc = TRUE)

# Plotting omega scores:
ScalingCurrAff <- ggplot(dat.plot, aes(x=date, y=theta, colour=Titolo))+ 
        geom_point(size=0.5, alpha=0.4, shape=16) +
        stat_smooth(method="loess", level=0.99, span=0.7) +
        guides(color=guide_legend(title="News program"), title.position="top")+
        theme_bw() + 
        scale_color_manual(values=legend) +
        xlab("")+
        ylab("Omega scores")+
        ylim(-2,+2) +
        ggtitle("WORDFISH scaling on current affairs news") +
        theme(plot.title = element_text(size = 16)) +
        theme(legend.text=element_text(size=12)) + 
        guides(fill=guide_legend(title=NULL))


# Mean time table
load(file=file.path(here::here(), "data", "processed", "dat_transcripts_alltopics.Rdata"))
dat$Titolo <- factor(dat$Titolo)
dat$Titolo <- factor(dat$Titolo, levels(dat$Titolo)[c(2,3,4,5,6,1,7)])
dat$month <- month(dat$date)
dat$year <- year(dat$date)
dat$date_num <- as.factor(paste(dat$month, dat$year, sep="-"))
dat$date_num <- relevel(dat$date_num, ref="7-2010")

cron.dur <- dat %>% 
        select(MacroArgomento, Titolo, date, duration, date_num) %>% 
        filter(MacroArgomento=="Cronaca" & !is.na(Titolo)) %>%
        arrange(date) %>% 
        group_by(date_num, Titolo) %>% 
        summarise(duration=sum(duration))

# Average dayli airtime spent on cronaca by TG: 
tapply(X = cron.dur$duration, INDEX = list(cron.dur$Titolo), FUN = mean, na.rm=TRUE)

# Baseline reg
cron.fit <- lm(duration ~ Titolo,
                data=cron.dur)
# + Month FE
cron.fit2 <- lm(duration ~ Titolo + date_num ,
                 data=cron.dur)

stargazer(cron.fit, cron.fit2, 
          style = "apsr", type = "latex", 
          star.cutoffs = c(0.05, 0.01, 0.001),
          digits = 3,
          omit.stat = c("adj.rsq"),
          omit = c("date_num"),
          out = file.path(here::here(),"output", "table_omega_reg_cron_duration.tex"),
          add.lines=list(c("Month FE", "", "X")))


# Plotting time duration:
load(file=file.path(here::here(), "data", "processed", "dat_transcripts_alltopics.Rdata"))
dat$Titolo <- factor(dat$Titolo)
cron.dur <- dat %>% 
        select(MacroArgomento, Titolo, date, duration) %>% 
        filter(MacroArgomento=="Cronaca" & !is.na(Titolo)) %>%
        arrange(date) %>% 
        group_by(date, Titolo) %>% 
        summarise(duration=sum(duration))
cron.dur$cum_dur <- ave(cron.dur$duration, cron.dur$Titolo, FUN=cumsum)

library(forcats)
cron.dur$Titolo <- fct_reorder(.f = cron.dur$Titolo, .x = cron.dur$cum_dur, .fun = mean, na.rm=TRUE, .desc = TRUE)

DuratCurrAff <- ggplot(cron.dur, aes(x=date, y=cum_dur, colour=Titolo))+ 
        geom_point(size=0.5, alpha=0.4, shape=16) +
        stat_smooth(method="loess", level=0.99, span=0.7) +
        guides(color=guide_legend(title="News program"), title.position="top")+
        theme_bw() + 
        scale_color_manual(values=legend) +
        ggtitle("Airtime current affairs coverage") +
        ylab("Cumulative time (in hours)") + xlab("") +
        theme(plot.title = element_text(size = 16)) +
        theme(legend.text=element_text(size=12)) + 
        guides(fill=guide_legend(title=NULL))

Fig_CrimeNews <- ggpubr::ggarrange(ScalingCurrAff, DuratCurrAff, nrow = 2,common.legend = TRUE, legend="right")
ggsave(Fig_CrimeNews, filename = file.path(here::here(), "figures","Fig_CrimeNews.png"),
       device = "png", width = 7, height = 5, units = "in", dpi=600)
ggsave(Fig_CrimeNews, filename = file.path(here::here(), "figures","Fig_CrimeNews.pdf"),
       device = "pdf", width = 7, height = 5, units = "in", dpi=600)


####
# Economic news  -----------------
####

load(file=file.path(here::here(),"data", "processed","dat.econ.wordfish.RData"))
dat$Titolo <- factor(dat$Titolo)
dat$Titolo <- factor(dat$Titolo, levels(dat$Titolo)[c(2,3,4,5,6,1,7)])
dat$month <- month(dat$date)
dat$year <- year(dat$date)
dat$date_num <- as.factor(paste(dat$month, dat$year, sep="-"))
dat$date_num <- relevel(dat$date_num, ref="7-2010")

# Regression baseline
omega.fit.econ <- lm(theta ~ Titolo,
                     data=dat)
# + Month FE
omega.fit2.econ <- lm(theta ~ Titolo + date_num ,
                      data=dat)
stargazer(omega.fit.econ, omega.fit2.econ, 
          style = "apsr", type = "latex", 
          star.cutoffs = c(0.05, 0.01, 0.001),
          digits = 3,
          omit.stat = c("adj.rsq"),
          omit = c("date_num"),
          out = file.path(here::here(),"output", "table_omega_reg_economia.tex"),
          add.lines=list(c("Month FE", "", "X")))



load(file=file.path(here::here(),"data", "processed","dat.econ.wordfish.RData"))
dat.plot <- dat %>% 
        select(theta, date, Titolo) %>% 
        filter(!is.na(theta) & !is.na(Titolo))
dat.plot$Titolo <- fct_reorder(.f = dat.plot$Titolo, .x = dat.plot$theta, .fun = mean, na.rm=TRUE, .desc = TRUE)

# Plotting omega scores
ScalingEcon <- ggplot(dat.plot, aes(x=date, y=theta, colour=Titolo))+ 
        geom_point(size=0.5, alpha=0.4, shape=16) +
        stat_smooth(method="loess", level=0.99, span=0.7) +
        guides(color=guide_legend(title="News program"), title.position="top")+
        theme_bw() + 
        scale_color_manual(values=legend) +
        ylab("Omega scores")+ xlab("")+
        ylim(-2,+2) +
        ggtitle("WORDFISH scaling on economic news") +
        theme(plot.title = element_text(size = 16)) +
        theme(legend.text=element_text(size=12)) + 
        guides(fill=guide_legend(title=NULL))


# Mean time table
load(file=file.path(here::here(), "data", "processed", "dat_transcripts_alltopics.Rdata"))
dat$Titolo <- factor(dat$Titolo)
dat$Titolo <- factor(dat$Titolo, levels(dat$Titolo)[c(2,3,4,5,6,1,7)])
dat$month <- month(dat$date)
dat$year <- year(dat$date)
dat$date_num <- as.factor(paste(dat$month, dat$year, sep="-"))
dat$date_num <- relevel(dat$date_num, ref="7-2010")

econ.dur <- dat %>% 
        select(MacroArgomento, Titolo, date, duration, date_num) %>% 
        filter(MacroArgomento=="Economia" & !is.na(Titolo)) %>%
        arrange(date) %>% 
        group_by(date_num, Titolo) %>% 
        summarise(duration=sum(duration))

# Average dayli airtime spent on econaca by TG: 
tapply(X = econ.dur$duration, INDEX = list(econ.dur$Titolo), FUN = mean, na.rm=TRUE)

# Baseline reg
econ.fit <- lm(duration ~ Titolo,
               data=econ.dur)
# + Month FE
econ.fit2 <- lm(duration ~ Titolo + date_num ,
                data=econ.dur)

stargazer(econ.fit, econ.fit2, 
          style = "apsr", type = "latex", 
          star.cutoffs = c(0.05, 0.01, 0.001),
          digits = 3,
          omit.stat = c("adj.rsq"),
          omit = c("date_num"),
          out = file.path(here::here(),"output", "table_omega_reg_econ_duration.tex"),
          add.lines=list(c("Month FE", "", "X")))

# Time duration of news
detach(plyr)
load(file=file.path(here::here(), "data", "processed", "dat_transcripts_alltopics.Rdata"))
dat$Titolo <- factor(dat$Titolo)

econ.dur <- dat %>% 
        select(MacroArgomento, Titolo, date, duration) %>% 
        filter(MacroArgomento=="Economia" & !is.na(Titolo)) %>%
        arrange(date) %>% 
        group_by(date, Titolo) %>% 
        summarise(duration=sum(duration))
econ.dur$cum_dur <- ave(econ.dur$duration, econ.dur$Titolo, FUN=cumsum)
econ.dur$Titolo <- fct_reorder(.f = econ.dur$Titolo, .x = econ.dur$cum_dur, .fun = mean, na.rm=TRUE, .desc = TRUE)

DuratEcon <- ggplot(econ.dur, aes(x=date, y=cum_dur, colour=Titolo))+ 
        geom_point(size=0.5, alpha=0.4, shape=16) +
        stat_smooth(method="loess", level=0.99, span=0.7) +
        guides(color=guide_legend(title="News program"), title.position="top")+
        theme_bw() + 
        scale_color_manual(values=legend) +
        ggtitle("Airtime economic news") +
        ylab("Cumulative time (in hours)") + xlab("") +
        theme(plot.title = element_text(size = 16)) +
        theme(legend.text=element_text(size=12)) + 
        guides(fill=guide_legend(title=NULL))


Fig_EconNews <- ggpubr::ggarrange(ScalingEcon, DuratEcon, nrow = 2,common.legend = TRUE, legend="right")
ggsave(Fig_EconNews, filename = file.path(here::here(), "figures","Fig_EconNews.png"),
       device = "png", width = 7, height = 5, units = "in", dpi=600)
ggsave(Fig_EconNews, filename = file.path(here::here(), "figures","Fig_EconNews.pdf"),
       device = "pdf", width = 7, height = 5, units = "in", dpi=600)


# All plots in one:
Fig_AllPlots <- ggpubr::ggarrange(Fig_CrimeNews, Fig_EconNews, ncol = 2, common.legend = TRUE, legend="right")
ggsave(Fig_AllPlots, filename = file.path(here::here(), "figures","Fig_Crime_EconNews.png"),
       device = "png", width = 7, height = 5, units = "in", dpi=600)
ggsave(Fig_AllPlots, filename = file.path(here::here(), "figures","Fig_Crime_EconNews.pdf"),
       device = "pdf", width = 7, height = 5, units = "in", dpi=600)